#Import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pickle
from keras.preprocessing import sequence
from keras.models import Sequential,load_model
from keras.layers import Input,Dense,Dropout, Embedding
from keras.layers import LSTM
from keras.optimizers import Adam
import warnings
import livelossplot
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG
warnings.filterwarnings("ignore")
#Function to load the train, labels and test files
def load_data():
    """Read the features, labels and test CSVs from the ``data/`` directory.

    Returns a ``(features, labels, test_data)`` tuple of DataFrames.
    """
    frames = (pd.read_csv('data/features.csv'),
              pd.read_csv('data/labels.csv'),
              pd.read_csv('data/test.csv'))
    return frames
# Function gets the dataset, preprocess it and sends back the final dataset
def get_dataset():
    """Load and clean the data; return train features, train labels and test features.

    NOTE(review): the returned X_train is the FULL cleaned frame (ids, title
    and the label columns from position 157 onward included); downstream code
    slices ``iloc[:, 3:157]`` for the numeric features — confirm column layout.
    """
    raw_features, raw_labels, raw_test = load_data()
    train_frame, test_frame = preprocess_data(raw_features, raw_labels, raw_test)
    # Label columns live from positional index 157 to the end of the merged frame.
    label_frame = train_frame.iloc[:, 157:]
    return train_frame, label_frame, test_frame
#Function implements preprocessing of removing null value records and reset index
def preprocess_data(features, labels, test_data):
    """Left-join features with labels, drop null rows and reset the index.

    Parameters
    ----------
    features, labels : pd.DataFrame — joined on 'trackID'.
    test_data : pd.DataFrame — cleaned independently.

    Returns
    -------
    (train_data, test_data) : cleaned DataFrames with fresh 0..n-1 indices.

    FIX: the original called ``dropna(inplace=True)`` on the *caller's*
    ``test_data`` frame, silently mutating the argument; we now return a
    cleaned copy instead (the visible caller only uses the return values).
    """
    train_data = pd.merge(features, labels, on='trackID', how="left")
    # Rows whose track has no label (or any null feature) are discarded.
    train_data = train_data.dropna().reset_index(drop=True)
    test_data = test_data.dropna().reset_index(drop=True)
    return train_data, test_data
#Function to visualize and attain deeper knowledge on data
def genre_spread(data):
    """Count-plot the genre column, annotating each bar with its share of the data."""
    fig, ax = plt.subplots(figsize=(20, 10))
    genre_order = data['genre'].value_counts().index
    plot = sns.countplot(x='genre', palette="Blues_r", data=data, order=genre_order)
    plt.title('Percentage of Genre type', fontsize=40)
    n_rows = data.shape[0]
    # Annotate each bar with the percentage of tracks it represents.
    for bar in ax.patches:
        share = bar.get_height() * 100 / n_rows
        ax.annotate(f'{share:.2f}%',
                    (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                    ha='center',
                    va='center',
                    fontsize=15,
                    color='red',
                    xytext=(0, 7),
                    textcoords='offset points')
    print('Genre Classic pop and rock and folk generates 40% of our data. (Most Preferred)')
    print('Genre jazz and blues are the least preferred genre')
def distribution_viz(data):
    """Overlay per-genre KDE curves for the meaningful columns (positions 3-8).

    One figure per column: all genre curves are drawn, then labelled and shown.
    (Indentation was lost in the notebook export; this per-column grouping is
    the reconstruction — confirm against the original notebook.)
    """
    plt.title("Distribution of values by genre", fontsize = 18)
    genres = data['genre'].unique()
    for col in data.iloc[:, 3:9].columns:
        for genre in genres:
            sns.kdeplot(data=data.loc[data['genre'] == genre, col], label=genre)
        plt.xlabel(col, fontsize=20)
        plt.legend()
        plt.show()
def outlier_viz(data):
    """Draw one box plot per selected column (positions 3-9) to eyeball outliers."""
    for col in data.iloc[:, 3:10].columns:
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.boxplot(x=data[col])
        plt.xlabel(col, fontsize=20)
        plt.show()
#Writing different functions for correlation, only to visualize in a better way in jupyter notebook
def corr_viz1(data, start=3, stop=154):
    """Annotated heatmap of pairwise correlations between continuous columns.

    Generalized: ``start``/``stop`` are positional column bounds passed to
    ``iloc``; the defaults (3, 154) reproduce the original behaviour of
    plotting all 154 continuous variables, so existing callers are unchanged.
    """
    correlations = data.iloc[:, start:stop].corr()
    fig, ax = plt.subplots(figsize=(30, 30))
    sns.heatmap(correlations, vmax=1.0, center=0, fmt='.2f',
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .70})
    plt.show()
#Better visualization of Correlation of features having high correlation in previous plot
def corr_viz2(data):
    """Zoomed-in correlation heatmap for positional columns 37-66."""
    corr = data.iloc[:, 37:67].corr()
    fig, ax = plt.subplots(figsize=(30, 30))
    sns.heatmap(corr, vmax=1.0, center=0, fmt='.2f', square=True,
                linewidths=.5, annot=True, cbar_kws={"shrink": .70})
    plt.show()
def corr_viz3(data):
    """Zoomed-in correlation heatmap for positional columns 67-96."""
    corr = data.iloc[:, 67:97].corr()
    fig, ax = plt.subplots(figsize=(30, 30))
    sns.heatmap(corr, vmax=1.0, center=0, fmt='.2f', square=True,
                linewidths=.5, annot=True, cbar_kws={"shrink": .70})
    plt.show()
def corr_viz4(data):
    """Zoomed-in correlation heatmap for positional columns 97-126."""
    corr = data.iloc[:, 97:127].corr()
    fig, ax = plt.subplots(figsize=(30, 30))
    sns.heatmap(corr, vmax=1.0, center=0, fmt='.2f', square=True,
                linewidths=.5, annot=True, cbar_kws={"shrink": .70})
    plt.show()
def corr_viz5(data):
    """Zoomed-in correlation heatmap for positional columns 127-156."""
    corr = data.iloc[:, 127:157].corr()
    fig, ax = plt.subplots(figsize=(30, 30))
    sns.heatmap(corr, vmax=1.0, center=0, fmt='.2f', square=True,
                linewidths=.5, annot=True, cbar_kws={"shrink": .70})
    plt.show()
#Function to plot the feature 'tempo's importance over the target value
def imp_fea_viz1(data):
    """Bar plot of 'tempo' against genre — an eyeball check of feature impact."""
    plt.figure(figsize=(14, 8))
    bars = sns.barplot(x=data["genre"], y=data["tempo"], palette='husl')
    plt.title('Impact of feature tempo on Genre type', fontsize=40)
    ax = bars.axes  # kept for parity with the original (unused)
    plt.show()
#Function to plot the feature 'vect_41's importance over the target value
def imp_fea_viz2(data):
    """Bar plot of 'vect_41' against genre — an eyeball check of feature impact."""
    plt.figure(figsize=(14, 8))
    bars = sns.barplot(x=data["genre"], y=data["vect_41"], palette='husl')
    plt.title('Impact of feature vect_41 on Genre type', fontsize=40)
    ax = bars.axes  # kept for parity with the original (unused)
    plt.show()
#Function to plot the feature 'loudness's importance over the target value
def imp_fea_viz3(data):
    """Bar plot of 'loudness' against genre — an eyeball check of feature impact."""
    plt.figure(figsize=(14, 8))
    bars = sns.barplot(x=data["genre"], y=data["loudness"], palette='husl')
    plt.title('Impact of feature loudness on Genre type', fontsize=40)
    ax = bars.axes  # kept for parity with the original (unused)
    plt.show()
#Function to summarize the shapes of input and put dataframe before sending it to the target CNN model
def data_summary(X_train, Y_train, X_val, Y_val, X_test):
    """Print the shapes of the train / validation / test splits."""
    named_splits = (
        ('Train features shape:', X_train),
        ('Train labels shape:', Y_train),
        ('Validation features shape:', X_val),
        ('Validation labels shape:', Y_val),
        ('Test features shape:', X_test),
    )
    for label, split in named_splits:
        print(label, split.shape)
# Function gets a list of models to evaluate for feature importance
# This will later help to build the final Logistic Regression model
def get_models():
    """Return MinMaxScaler -> PCA(n) -> LogisticRegression pipelines for n in 3..153.

    Keys are the component count as a string, matching the original mapping.
    """
    return {
        str(n_components): Pipeline(steps=[
            ('norm', preprocessing.MinMaxScaler()),
            ('pca', PCA(n_components=n_components)),
            ('m', LogisticRegression()),
        ])
        for n_components in range(3, 154)
    }
# evaluate a given model using cross-validation for feature importance
def evaluate_model(model, X, y):
    """Accuracy scores from repeated stratified 10-fold CV (3 repeats, fixed seed)."""
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='accuracy',
                           cv=folds, n_jobs=-1, error_score='raise')
# explore dataset
# Load the raw CSVs and inspect the feature table's dimensions
# (bare expression below is notebook-style display output).
features,labels,test_data = load_data()
features.shape
There are 8128 rows and 157 columns in our features dataset.
#Describe the dataset to know about the mean distribution
features.describe()
#Study of null values in the features data
# NOTE(review): null_counts= is deprecated in newer pandas in favour of
# show_counts= -- confirm the pinned pandas version.
features.info(verbose=True, null_counts=True)
Since fewer than 5% of the values in every column are null, those records are eliminated during data preprocessing.
#Study of null values in the labels data
labels.info()
#Get to know the distribution of records across label values
#This will help us understand the skewness of data over one particular target value
labels['genre'].value_counts()
#load the processed data
X_train, Y_train, X_test = get_dataset()
# Notebook-style display of the processed training frame.
X_train
# Exploratory plots: genre shares, per-genre distributions, and box plots.
genre_spread(X_train)
distribution_viz(X_train)
outlier_viz(X_train)
The above box plot gives information about outliers, but it doesn't provide detailed information about the individual outlier data points.
# Inspect the dataset with outliers removed: keep only rows whose z-score is
# below 3 in every continuous column (result is displayed, not assigned).
X_train[(np.abs(stats.zscore(X_train.iloc[:,3:154])) < 3).all(axis=1)].iloc[:,3:154]
As the outliers contribute 34% of our entire dataset, we keep them as they are. In a real-life scenario this should be communicated to the respective business team / stakeholders.
# Full correlation heatmap over all continuous columns.
corr_viz1(X_train)
From the above correlation plot, we can see some columns are more correlated. To study more about those columns, I ran correlation for those specific columns
# Zoom into the highly correlated column band spotted in the full heatmap.
corr_viz2(X_train)
Now, we can clearly see the column 42 is highly positively correlated with columns 43-52 and negatively correlated with column 54
# Remaining column bands of the correlation study.
corr_viz4(X_train)
corr_viz5(X_train)
Similarly, I studied the correlation of the other features for dimensionality reduction. Next, I studied the effect of each feature on the target column, which helps in understanding feature importance.
# Per-feature impact plots: tempo, vect_41 and loudness against genre.
imp_fea_viz1(X_train)
imp_fea_viz2(X_train)
imp_fea_viz3(X_train)
We can see the feature 'vect_41' (plotted above) has a huge potential for predicting the 'jazz and blues' genre. To understand such feature importance and also to reduce dimensionality, Principal Component Analysis is done.
#Copy the data so that we use it later build final predictions results with trackid and title
X_train_data = X_train.copy()
Y_train_data = Y_train.copy()
# 90/10 train/validation split: positional columns 3:157 are the numeric
# features, the last column of Y_train is the genre label string.
X_train, X_val, Y_train, Y_val = train_test_split(X_train.iloc[:, 3:157].values, Y_train.iloc[:,-1].values, test_size=0.10, random_state=42)
# Function gets the models to evaluate for feature importance
models = get_models()
# Function evaluates the models and store results
# Cross-validate each PCA(n) pipeline and collect its accuracy scores.
results, names = list(), list()
for name, model in models.items():
scores = evaluate_model(model, X_train, Y_train)
results.append(scores)
names.append(name)
print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
# plot model performance for comparison
fig, ax = plt.subplots(figsize=(30,30))
plt.boxplot(results, labels=names, showmeans=True)
plt.xticks(rotation=45)
plt.show()
#Simple Logistic Regression is run with C=1
# Baseline: MinMax scaling -> PCA(60) -> one-vs-rest logistic regression.
steps = [('norm', preprocessing.MinMaxScaler()),('pca', PCA(n_components=60)), ('m', LogisticRegression(C=1,multi_class = 'ovr'))]
models = Pipeline(steps=steps)
models.fit(X_train,Y_train)
m_predict = models.predict(X_val)
m_report ="""Calculated
Train Accuracy:{}
Validation Accuracy: {}
""".format(
models.score(X_train, Y_train),
metrics.accuracy_score(Y_val, m_predict))
print(m_report)
# BUG FIX: the original pickled `model` -- the last *unfitted* pipeline left
# over from the evaluation loop above -- instead of the fitted `models`
# pipeline. Also use a context manager so the file handle is closed.
filename = 'LR_C1.bin'
with open(filename, 'wb') as f:
    pickle.dump(models, f)
#Testing for different C values, (C value as hyper parameter tuning)
# Sweep C over 10^-3 .. 10^6 and record train/validation log-loss so the
# complexity/error trade-off can be plotted below.
inv_log_likelihood_train=[]
inv_log_likelihood_val=[]
num_C = 10
C = [1.0] * num_C
for i in range(num_C):
C[i] = pow(10, i-3)
print("C-value:",C[i])
steps = [('norm', preprocessing.MinMaxScaler()),('pca', PCA(n_components=60)), ('m', LogisticRegression(C=C[i],solver = 'newton-cg',multi_class = 'multinomial'))]
multinom_newton_estimator = Pipeline(steps=steps)
multinom_newton_estimator.fit(X_train,Y_train)
m_predict = multinom_newton_estimator.predict(X_val)
predict_probability_train = multinom_newton_estimator.predict_proba(X_train)
predict_probability_test = multinom_newton_estimator.predict_proba(X_val)
inv_log_likelihood_train.append(metrics.log_loss(Y_train,predict_probability_train))
inv_log_likelihood_val.append(metrics.log_loss(Y_val,predict_probability_test))
# Plot train/validation error against C on a log-x axis.
plt.figure(figsize = (10, 10))
plt.xscale('log')
plt.plot(C, inv_log_likelihood_train, 'bo-',C,inv_log_likelihood_val,'ro-')
plt.legend(['Train Error','Test Error'])
plt.title('Change in Train and Test Error with Complexity Trade-off')
plt.xlabel('Complexity')
plt.ylabel('Error')
plt.show()
The above plot clearly shows that after C=10 there is not much change in the model. Hence we consider the model with C=10 for our classification.
#Building model with C = 10
# Final LR model picked from the sweep: MinMax -> PCA(60) -> multinomial
# logistic regression with C=10.
steps = [('norm', preprocessing.MinMaxScaler()),('pca', PCA(n_components=60)), ('m', LogisticRegression(C=10,solver = 'newton-cg',multi_class = 'multinomial'))]
models = Pipeline(steps=steps)
models.fit(X_train,Y_train)
m_predict = models.predict(X_val)
m_report ="""Calculated
Train Accuracy:{}
Validation Accuracy: {}
""".format(
models.score(X_train, Y_train),
metrics.accuracy_score(Y_val, m_predict))
print(m_report)
# Persist the fitted pipeline for the prediction stage.
# NOTE(review): the file handle is left to the GC; a `with` block would be safer.
filename = 'LR_C10.bin'
pickle.dump(models, open(filename, 'wb'))
#Getting the data ready NN model with LSTM classifier
X_train,Y_train,X_test = get_dataset()
# Treat genre as categorical, keep an un-encoded copy for later reporting,
# then one-hot encode the labels for the softmax output layer.
Y_train = Y_train.astype({"genre":'category'})
Y_train_data = Y_train.copy()
Y_train = pd.get_dummies(Y_train)
X_train_data = X_train.copy()
X_train, X_val, Y_train, Y_val = train_test_split(X_train.iloc[:, 3:157].values, Y_train.values, test_size=0.10, random_state=42)
X_test = X_test.iloc[:, 3:157].values
# The LSTM expects 3-D input (samples, timesteps, features); each track is
# fed as a single timestep.
X_train= np.reshape(X_train,(X_train.shape[0], 1, X_train.shape[1]))
X_val= np.reshape(X_val,(X_val.shape[0], 1, X_val.shape[1]))
X_test= np.reshape(X_test,(X_test.shape[0], 1, X_test.shape[1]))
data_summary(X_train, Y_train, X_val, Y_val,X_test)
#Building the NN model with LSTM classifier
plot_losses = livelossplot.PlotLossesKeras()
NUM_CATEGORIES = labels.genre.nunique()
#Defining the hyperparameters
epochs = 100
batch_size = 16
adam = Adam(lr=0.001, decay=0.000049, epsilon=1e-8)
model = Sequential()
# NOTE(review): no input_shape on the first layer -- relies on Keras deferred
# build at fit() time; confirm the installed Keras version supports this.
model.add(LSTM(10, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(72, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(NUM_CATEGORIES, activation='softmax'))
#if we dont one-hot encoding use sparse_categorical_crossentropy as loss function
# BUG FIX: the original passed optimizer='adam' (a fresh, default Adam),
# which silently discarded the tuned `adam` instance configured above.
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,verbose = 0,callbacks =[plot_losses],validation_data=(X_val, Y_val))
model.save('NN_1.bin')
# Reload the saved model to confirm the artifact round-trips correctly.
NN_model = load_model('NN_1.bin')
# Summary of the NN model with LSTM classifier
NN_model.summary()
#NN layer visualization
SVG(model_to_dot(NN_model,dpi=65).create(prog='dot', format='svg'))
# evaluate() returns [loss, accuracy] per the compile metrics above.
train_scores = NN_model.evaluate(X_train, Y_train)
val_scores = NN_model.evaluate(X_val, Y_val)
# Reload the processed data and rebuild the 90/10 split (same data and
# random_state=42 as before, so the validation fold matches the LR section).
X_train,Y_train,X_test = get_dataset()
X_train_data = X_train.copy()
Y_train_data = Y_train.copy()
X_train, X_val, Y_train, Y_val = train_test_split(X_train.iloc[:, 3:157].values, Y_train.iloc[:,-1].values, test_size=0.10, random_state=42)
# Load the persisted C=10 pipeline (trusted local artifact -- pickle must
# never be used on untrusted data) with a properly closed file handle.
filename = 'LR_C10.bin'
with open(filename, 'rb') as f:
    LR_model = pickle.load(f)
result = LR_model.score(X_val, Y_val)
# FIX: dropped a redundant predict on the split X_train whose result was
# immediately overwritten; we only need predictions on the full frame so the
# output can carry title and actual genre.
m_predict_train = LR_model.predict(X_train_data.iloc[:, 3:157].values)
train_pred_op = pd.DataFrame(m_predict_train,columns=['predicted_genre'])
train_pred_op['title'] = X_train_data.title.tolist()
train_pred_op['actual_genre'] = X_train_data.genre.tolist()
train_pred_op = train_pred_op[['title','actual_genre','predicted_genre']]
# Persist the per-track training predictions for inspection.
output_dir = 'outputs/LR/'
output_file = 'train_predictions.csv'
train_pred_op.to_csv(output_dir + output_file)
# Final comparison report: LR figures come from the fitted C=10 pipeline
# (`models`, `m_predict`), NN figures from the evaluate() calls above.
# NOTE(review): `m_predict` was computed on the earlier validation split;
# the re-split above uses the same data and random_state=42 so Y_val should
# line up -- confirm if the preprocessing changes.
m_report ="""Calculated
Logistic Regression Train Accuracy:{}
Logistic Regression Validation Accuracy: {}
Neural Network LSTM Classifier Train Loss:{}
Neural Network LSTM Classifier Train Accuracy:{}
Neural Network LSTM Validation Loss:{}
Neural Network LSTM Validation Accuracy: {}
""".format(
models.score(X_train, Y_train),
metrics.accuracy_score(Y_val, m_predict),str(train_scores[0]),str(train_scores[1]),
str(val_scores[0]),str(val_scores[1])
)
print(m_report)
#Function gets the preprocessed test dataframe as input
def make_predictions(test_data, labels):
    """Predict genres for *test_data* with both persisted models.

    Parameters
    ----------
    test_data : pd.DataFrame -- preprocessed test set; positional columns
        3:157 are the numeric features and it must carry 'trackID' and
        'title' columns.
    labels : pd.DataFrame -- training labels, used to map the NN's integer
        class indices back to genre names.

    Returns
    -------
    (test_pred_op, NN_test_pred_op) : DataFrames, each with columns
        ['trackID', 'title', 'predicted_genre'].
    """
    # Load the pre-trained models (trusted local artifacts -- pickle must
    # never be used on untrusted input).
    LR_model_file = 'LR_C10.bin'
    with open(LR_model_file, 'rb') as f:
        LR_model = pickle.load(f)
    NN_model_file = 'NN_1.bin'
    NN_model = load_model('NN_1.bin')
    #Predict target genres on test data using LR model
    m_predict_test = LR_model.predict(test_data.iloc[:, 3:157].values)
    print('Successfully predicted LR model')
    #Store predicted classes along with its title name
    test_pred_op = pd.DataFrame(m_predict_test, columns=['predicted_genre'])
    test_pred_op['title'] = test_data.title.tolist()
    test_pred_op['trackID'] = test_data.trackID.tolist()
    test_pred_op = test_pred_op[['trackID', 'title', 'predicted_genre']]
    print('Successfully stored LR model predictions')
    # Reshape to the 3-D (samples, 1, features) layout the LSTM was trained on.
    X_test = test_data.iloc[:, 3:157].values
    X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
    # predict_classes returns integer class indices (deprecated in newer
    # Keras -- confirm the pinned version still provides it).
    NN_model_predictions = NN_model.predict_classes(X_test)
    print('Successfully predicted NN model')
    # Map integer class indices back to genre strings.
    # NOTE(review): sort_values keeps the original index, so the index-based
    # merge below still maps prediction i to the i-th *unsorted* unique genre;
    # if the alphabetical one-hot column order from pd.get_dummies was
    # intended, a reset_index(drop=True) after sorting is needed -- confirm.
    temp_labels = pd.DataFrame(labels['genre'].unique(), columns=['predicted_genre'])
    temp_labels = temp_labels.sort_values(by=['predicted_genre'], ascending=True)
    temp = pd.DataFrame(NN_model_predictions, columns=['predictions'])
    NN_test_pred_op = pd.merge(temp, temp_labels, right_index=True, left_on='predictions')
    NN_test_pred_op['title'] = test_data.title.tolist()
    NN_test_pred_op['trackID'] = test_data.trackID.tolist()
    NN_test_pred_op = NN_test_pred_op[['trackID', 'title', 'predicted_genre']]
    # FIX: removed a no-op self-assignment of the 'predicted_genre' column.
    print('Successfully stored NN model predictions')
    return test_pred_op, NN_test_pred_op
# Reload the raw data and processed splits, then generate predictions on the
# held-out test set with both saved models.
features,labels,test_data = load_data()
X_train,Y_train,X_test = get_dataset()
test_pred_op,NN_test_pred_op = make_predictions(X_test,labels)
Please refer to the src folder for the web application implementing these models. I have implemented it for both models, so that we can cross-check data points between them. Even though the NN model's accuracy is lower, we can still use it for the correctly classified genres.
Future work,